Raw data

Dataset downloaded from Arkinglab website in the Transcriptome analysis reveals dysregulation of innate immune response genes and neuronal activity-dependent genes in autism section.

Load and annotate data

# Load csvs
datExpr = read.delim('./../Data/datExpr.csv')
datMeta = read.delim('./../Data/datPheno.csv')

# Create dataset with gene information
datGenes = data.frame('Ensembl_ID' = substr(datExpr$Gene, 1, 15), 
                      'gene_name' = substring(datExpr$Gene, 17))
rownames(datExpr) = datGenes$Ensembl_ID
datExpr$Gene = NULL


### CLEAN METADATA DATA FRAME
datMeta = datMeta %>% dplyr::select('ID', 'case', 'sampleid', 'brainregion', 'positiononplate', 
                                       'Gender', 'Age', 'SiteHM', 'RIN', 'PMI', 'Dx')
datMeta$brainregion = substr(datMeta$ID, 1, 4)
datMeta = datMeta %>% mutate(brain_lobe = ifelse(brainregion=='ba19', 'Occipital', 'Frontal'),
                             Diagnosis = ifelse(Dx=='Autism', 'ASD', 'CTL'))

# Convert Diagnosis variable to factor
datMeta$Diagnosis = factor(datMeta$Diagnosis, levels=c('CTL','ASD'))

# sampleid is actually subject ID
datMeta = datMeta %>% dplyr::rename(Subject_ID = sampleid)


# SFARI Genes
SFARI_genes = read_csv('./../../../Models/FirstPUModel/Data/SFARI/SFARI_genes_08-29-2019_with_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]

Check sample composition

Data description taken from the paper Transcriptome analysis reveals dysregulation of innate immune response genes and neuronal activity-dependent genes in autism:

Transcriptomes from 104 human brain cortical tissue samples were resolved using next-generation RNA sequencing technology at single-gene resolution and through co-expressing gene clusters or modules. Multiple cortical tissues corresponding to Brodmann Area 19 (BA19), Brodmann Area 10 (BA10) and Brodmann Area 44 (BA44) were sequenced in 62, 14 and 28 samples, respectively, resulting in a total of 57 (40 unique individuals) control and 47 (32 unique individuals) autism samples.

Note: They analysed all of the regions together

Brain tissue: Frozen brain samples were acquired through the Autism Tissue Program, with samples originating from two different sites: the Harvard Brain Tissue Resource Center and the NICHD Brain and Tissue Bank at the University of Maryland (Gandal’s data were obtained also from the Autism Tissue Program, specifically from the Harvard Brain Bank)

Sequenced using Illumina’s HiSeq 2000 (Gandal used Illumina HiSeq 2500) Check if they are compatible

print(paste0('Dataset includes ', nrow(datExpr), ' genes from ', ncol(datExpr), ' samples belonging to ', length(unique(datMeta$Subject_ID)), ' different subjects.'))
## [1] "Dataset includes 62069 genes from 120 samples belonging to 72 different subjects."

Filtering criteria previous to level of expression

# In the paper they talk about an original number of 110 samples and dropping 6 because of low gene coverage, resulting in 104 samples (which are the ones that are included in datMeta), but the expression dataset has 120 samples.

no_metadata_samples = colnames(datExpr)[! colnames(datExpr) %in% datMeta$ID]
no_metadata_subjects = unique(substring(no_metadata_samples, 6))

add_metadata_subjects = no_metadata_subjects[no_metadata_subjects %in% datMeta$Subject_ID]
add_metadata_samples = no_metadata_samples[grepl(paste(add_metadata_subjects, collapse='|'),
                                                 no_metadata_samples)]

for(sample in add_metadata_samples){
  new_row = datMeta %>% filter(Subject_ID == strsplit(sample,'\\.')[[1]][2]) %>% dplyr::slice(1) %>%
            mutate(ID = sample, 
                   brainregion = strsplit(sample,'\\.')[[1]][1],
                   brain_lobe = ifelse(strsplit(sample,'\\.')[[1]][1]=='ba19','Occipital','Frontal'))
  datMeta = rbind(datMeta, new_row)
}

rm(no_metadata_subjects, no_metadata_samples, add_metadata_subjects, add_metadata_samples, sample, new_row)


# And remove the samples that have no metadata and don't have any other samples that do have metadata.

keep = substring(colnames(datExpr), 6) %in% datMeta$Subject_ID

datExpr = datExpr[,keep]

# Match order of datExpr columns and datMeta rows
datMeta = datMeta[match(colnames(datExpr), datMeta$ID),]

# Check they are in the same order
if(!all(colnames(datExpr) == datMeta$ID)){
  cat('order of samples don\'t match between datExpr and datMeta!')
}

rm(keep)


#######################################################################################################
# Annotate genes with BioMart information

# Cannot find 1580 ensembl ids using the archive that finds the largest number of genes is feb2014 (I cannot find the missing ones in any other archive version).


datGenes_original = datGenes

getinfo = c('ensembl_gene_id','external_gene_id','chromosome_name','start_position',
            'end_position','strand','band','gene_biotype','percentage_gc_content')

mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
               dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org')

# Getting gene info using Ensembl IDs
datGenes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(datExpr), mart=mart)
datGenes = datGenes[match(rownames(datExpr), datGenes$ensembl_gene_id),]
datGenes$length = datGenes$end_position-datGenes$start_position

missing_genes = datGenes_original$gene_name[is.na(datGenes$ensembl_gene_id)]

SFARI_genes_lost = missing_genes[missing_genes %in% SFARI_genes$`gene-symbol` &
                   !missing_genes %in% datGenes$external_gene_id] %>% unique

# Remove genes that did not return any results in bioMart
datGenes = datGenes %>% filter(!is.na(ensembl_gene_id))
datExpr = datExpr[rownames(datExpr) %in% datGenes$ensembl_gene_id,]

rm(getinfo, mart, datGenes_original)

#######################################################################################################
# Filtering

# 1. Filter genes with start or end position missing
to_keep = !is.na(datGenes$length)

datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]
rownames(datGenes) = datGenes$ensembl_gene_id

# 2. Filter genes that do not encode any protein
if(!all(rownames(datExpr)==rownames(datGenes))) print('!!! gene rownames do not match!!!')

to_keep = datGenes$gene_biotype=='protein_coding'
datExpr = datExpr %>% filter(to_keep)
datGenes = datGenes %>% filter(to_keep)
rownames(datExpr) = datGenes$ensembl_gene_id
rownames(datGenes) = datGenes$ensembl_gene_id

# 3. Filter genes with low expression levels
# 3.1 Remove genes with zero expression in all of the samples
to_keep = rowSums(datExpr)>0
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]

print(paste0('Removed ', sum(!to_keep), ' genes, ', sum(to_keep), ' remaining'))
## [1] "Removed 2811 genes, 19552 remaining"
print(paste0(length(unique(SFARI_genes$`gene-symbol`[SFARI_genes$ID %in% rownames(datExpr)])), ' SFARI genes remaining'))
## [1] "968 SFARI genes remaining"
# Save datasets before level of expression filtering
datExpr_original = datExpr
datGenes_original = datGenes
datMeta_original = datMeta

Filter criteria by mean level of expression

  • Filtering outlier samples (they aren’t many)

  • Creating DESeq object and normalising using vst transformation

thresholds = c(0.5, 1, seq(2,10), 12, 15, 17, 20, 30, 40, 50, 60, 70, 80, 100)

for(threshold in thresholds){
  
  datMeta = datMeta_original
  datExpr = datExpr_original
  datGenes = datGenes_original
  
  cat(paste0('\n\nFiltering with threshold: ', threshold,'\n'))
  to_keep = rowMeans(datExpr)>threshold
  datGenes = datGenes[to_keep,]
  datExpr = datExpr[to_keep,]
  
  # Filter outlier samples
  absadj = datExpr %>% bicor %>% abs
  netsummary = fundamentalNetworkConcepts(absadj)
  ku = netsummary$Connectivity
  z.ku = (ku-mean(ku))/sqrt(var(ku))
  
  to_keep = abs(z.ku)<2
  datMeta = datMeta[to_keep,]
  datExpr = datExpr[,to_keep]
  
  cat(paste0('Removing ', sum(!to_keep), ' samples\n'))
  
  rm(absadj, netsummary, ku, z.ku, to_keep)
  
  
  # Create a DeseqDataSet object, estimate the library size correction and save the normalized counts matrix
  counts = datExpr %>% as.matrix
  rowRanges = GRanges(datGenes$chromosome_name,
                    IRanges(datGenes$start_position, width=datGenes$length),
                    strand=datGenes$strand,
                    feature_id=datGenes$ensembl_gene_id)
  se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
  dds = DESeqDataSet(se, design =~Diagnosis)
  
  # Perform vst
  vsd = vst(dds)
  
  datExpr_vst = assay(vsd)
  datMeta_vst = colData(vsd)
  datGenes_vst = rowRanges(vsd)
  
  rm(counts, rowRanges, se, vsd)
  
  # Save summary results in dataframe
  if(threshold == thresholds[1]){
    mean_vs_sd_data = data.frame('threshold'=threshold, 'ID'=rownames(datExpr_vst),
                                 'Mean'=rowMeans(datExpr_vst), 'SD'=apply(datExpr_vst,1,sd))
  } else {
    new_entries = data.frame('threshold'=threshold, 'ID'=rownames(datExpr_vst),
                                 'Mean'=rowMeans(datExpr_vst), 'SD'=apply(datExpr_vst,1,sd))
    mean_vs_sd_data = rbind(mean_vs_sd_data, new_entries)
  }
}  
## 
## 
## Filtering with threshold: 0.5
## alpha: 1.000000
## Removing 4 samples
## 
## 
## Filtering with threshold: 1
## alpha: 1.000000
## Removing 4 samples
## 
## 
## Filtering with threshold: 2
## alpha: 1.000000
## Removing 3 samples
## 
## 
## Filtering with threshold: 3
## alpha: 1.000000
## Removing 2 samples
## 
## 
## Filtering with threshold: 4
## alpha: 1.000000
## Removing 2 samples
## 
## 
## Filtering with threshold: 5
## alpha: 1.000000
## Removing 2 samples
## 
## 
## Filtering with threshold: 6
## alpha: 1.000000
## Removing 2 samples
## 
## 
## Filtering with threshold: 7
## alpha: 1.000000
## Removing 2 samples
## 
## 
## Filtering with threshold: 8
## alpha: 1.000000
## Removing 2 samples
## 
## 
## Filtering with threshold: 9
## alpha: 1.000000
## Removing 2 samples
## 
## 
## Filtering with threshold: 10
## alpha: 1.000000
## Removing 2 samples
## 
## 
## Filtering with threshold: 12
## alpha: 1.000000
## Removing 2 samples
## 
## 
## Filtering with threshold: 15
## alpha: 1.000000
## Removing 2 samples
## 
## 
## Filtering with threshold: 17
## alpha: 1.000000
## Removing 2 samples
## 
## 
## Filtering with threshold: 20
## alpha: 1.000000
## Removing 2 samples
## 
## 
## Filtering with threshold: 30
## alpha: 1.000000
## Removing 2 samples
## 
## 
## Filtering with threshold: 40
## alpha: 1.000000
## Removing 3 samples
## 
## 
## Filtering with threshold: 50
## alpha: 1.000000
## Removing 3 samples
## 
## 
## Filtering with threshold: 60
## alpha: 1.000000
## Removing 3 samples
## 
## 
## Filtering with threshold: 70
## alpha: 1.000000
## Removing 3 samples
## 
## 
## Filtering with threshold: 80
## alpha: 1.000000
## Removing 3 samples
## 
## 
## Filtering with threshold: 100
## alpha: 1.000000
## Removing 3 samples
rm(new_entries)

Mean vs SD plot

to_keep_1 = mean_vs_sd_data$ID[mean_vs_sd_data$threshold==thresholds[1] & mean_vs_sd_data$Mean<6] %>%
            as.character
to_keep_2 = mean_vs_sd_data$ID[mean_vs_sd_data$threshold==thresholds[1] & mean_vs_sd_data$Mean>=6]
to_keep_2 = to_keep_2 %>% sample(round(length(to_keep_2)/20)) %>% as.character

plot_data = mean_vs_sd_data[mean_vs_sd_data$ID %in% c(to_keep_1, to_keep_2),]

Note: Plotting all of the genes is too heavy, so I’m going to filter most of the genes with the highest levels of expression because we care about the behaviour of the genes with low levels

The worst part of the left tail disappears relatively quickly (threshold of around 3 or 4), but it doesn’t seem to disappear completely until a threshold of over 20 or 30 (which is too high to use because we would lose too many genes)

ggplotly(plot_data %>% ggplot(aes(Mean, SD)) + 
         geom_point(color='#0099cc', alpha=0.2, aes(id=ID, frame=threshold)) + 
         scale_x_log10() + scale_y_log10() + theme_minimal())
rm(to_keep_1, to_keep_2, plot_data)

The trend line doesn’t actually flatten out with any threshold, just goes a bit crazy in the end (perhaps because it doesn’t have enough points at the lower end of the line)

ggplotly(mean_vs_sd_data %>% ggplot(aes(Mean, SD)) +
              geom_line(stat='smooth', method='loess', se=FALSE, alpha=0.5, 
                        aes(group=threshold, color=threshold)) +
              theme_minimal() + ggtitle('Trend lines for different filtering thresholds'))
plot_data = mean_vs_sd_data %>% group_by(threshold) %>% tally

ggplotly(plot_data %>% ggplot(aes(threshold, n)) + geom_point() + geom_line() + 
         theme_minimal() + ggtitle('Remaining genes for each filtering threshold'))

Conclusion:

I’m not sure which threshold is the best, perhaps 10 is a good compromise between keeping enough genes and removing the worst of the lower trend line, but the problematic trend is not completely gone with it…



Session info

sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] knitr_1.23                  dendextend_1.12.0          
##  [3] vsn_3.52.0                  WGCNA_1.68                 
##  [5] fastcluster_1.1.25          dynamicTreeCut_1.63-1      
##  [7] sva_3.32.1                  genefilter_1.66.0          
##  [9] mgcv_1.8-31                 nlme_3.1-143               
## [11] DESeq2_1.24.0               SummarizedExperiment_1.14.1
## [13] DelayedArray_0.10.0         BiocParallel_1.18.1        
## [15] matrixStats_0.54.0          Biobase_2.44.0             
## [17] GenomicRanges_1.36.0        GenomeInfoDb_1.20.0        
## [19] IRanges_2.18.2              S4Vectors_0.22.0           
## [21] BiocGenerics_0.30.0         biomaRt_2.40.4             
## [23] ggExtra_0.9                 GGally_1.4.0               
## [25] gridExtra_2.3               viridis_0.5.1              
## [27] viridisLite_0.3.0           RColorBrewer_1.1-2         
## [29] plotlyutils_0.0.0.9000      plotly_4.9.0               
## [31] glue_1.3.1                  reshape2_1.4.3             
## [33] forcats_0.4.0               stringr_1.4.0              
## [35] dplyr_0.8.2                 purrr_0.3.3                
## [37] readr_1.3.1                 tidyr_0.8.3                
## [39] tibble_2.1.3                ggplot2_3.2.0              
## [41] tidyverse_1.2.1            
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1           backports_1.1.4        Hmisc_4.2-0           
##   [4] plyr_1.8.4             lazyeval_0.2.2         splines_3.6.2         
##   [7] crosstalk_1.0.0        robust_0.4-18.1        digest_0.6.19         
##  [10] foreach_1.4.7          htmltools_0.3.6        GO.db_3.8.2           
##  [13] magrittr_1.5           checkmate_1.9.4        memoise_1.1.0         
##  [16] fit.models_0.5-14      cluster_2.1.0          doParallel_1.0.15     
##  [19] limma_3.40.6           annotate_1.62.0        modelr_0.1.5          
##  [22] prettyunits_1.0.2      colorspace_1.4-1       blob_1.2.0            
##  [25] rvest_0.3.4            rrcov_1.4-7            haven_2.1.1           
##  [28] xfun_0.8               crayon_1.3.4           RCurl_1.95-4.12       
##  [31] jsonlite_1.6           zeallot_0.1.0          impute_1.58.0         
##  [34] survival_3.1-8         iterators_1.0.12       gtable_0.3.0          
##  [37] zlibbioc_1.30.0        XVector_0.24.0         DEoptimR_1.0-8        
##  [40] scales_1.0.0           mvtnorm_1.0-11         DBI_1.0.0             
##  [43] miniUI_0.1.1.1         Rcpp_1.0.1             xtable_1.8-4          
##  [46] progress_1.2.2         htmlTable_1.13.1       foreign_0.8-73        
##  [49] bit_1.1-14             preprocessCore_1.46.0  Formula_1.2-3         
##  [52] htmlwidgets_1.3        httr_1.4.0             acepack_1.4.1         
##  [55] pkgconfig_2.0.2        reshape_0.8.8          XML_3.98-1.20         
##  [58] nnet_7.3-12            locfit_1.5-9.1         labeling_0.3          
##  [61] tidyselect_0.2.5       rlang_0.4.2            later_0.8.0           
##  [64] AnnotationDbi_1.46.1   munsell_0.5.0          cellranger_1.1.0      
##  [67] tools_3.6.2            cli_1.1.0              generics_0.0.2        
##  [70] RSQLite_2.1.2          broom_0.5.2            evaluate_0.14         
##  [73] yaml_2.2.0             bit64_0.9-7            robustbase_0.93-5     
##  [76] mime_0.7               xml2_1.2.2             compiler_3.6.2        
##  [79] rstudioapi_0.10        curl_3.3               affyio_1.54.0         
##  [82] geneplotter_1.62.0     pcaPP_1.9-73           stringi_1.4.3         
##  [85] lattice_0.20-38        Matrix_1.2-18          vctrs_0.2.0           
##  [88] pillar_1.4.2           BiocManager_1.30.4     data.table_1.12.2     
##  [91] bitops_1.0-6           httpuv_1.5.1           affy_1.62.0           
##  [94] R6_2.4.0               latticeExtra_0.6-28    promises_1.0.1        
##  [97] codetools_0.2-16       MASS_7.3-51.5          assertthat_0.2.1      
## [100] withr_2.1.2            GenomeInfoDbData_1.2.1 hms_0.5.1             
## [103] grid_3.6.2             rpart_4.1-15           rmarkdown_1.13        
## [106] shiny_1.3.2            lubridate_1.7.4        base64enc_0.1-3